1. IMPORT

library(ggplot2)
library(tidyverse)
library(dplyr)
library(harmony)
library(Seurat)
library(RColorBrewer)
library(SingleCellExperiment)
# Import seurat object (with log normalization)
seur.human <- readRDS("~/Projects/20220809_Thymic-iNKT-CrossSpecies/data/raw_data/human_data/filtered_seurat_harmony.11.22.22.RDS")

# Sanity check
print(seur.human) # 81,378 cells instead of 79,801 ??
## An object of class Seurat 
## 17212 features across 81378 samples within 1 assay 
## Active assay: RNA (17212 features, 2000 variable features)
##  4 dimensional reductions calculated: pca, initial_umap, harmony, UMAP_50
# Quick visualization
DimPlot(seur.human, reduction="UMAP_50")

# Quick QC
# VlnPlot(seur.human, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), group.by="Batch", ncol = 3) # it looks like poor quality cells were not removed (or not the same thresholds were used)
# seur.human@meta.data[setdiff(colnames(seur.human), colnames(seur.human.sct)),"extracells"] <- TRUE
# DimPlot(seur.human, reduction="UMAP_50", group.by="extracells")
# rownames(seur.human)[stringr::str_detect(rownames(seur.human), "MT")]

2. GET HIGHLY VARIABLE GENES

2.1. Integrated data

hvg.all <- VariableFeatures(seur.human)
length(hvg.all) # 2000
## [1] 2000
# Plot HVGs
LabelPoints(plot=VariableFeaturePlot(seur.human), points=head(VariableFeatures(seur.human), 20), repel=T)

# quick comparison with HVGs from integrated data with SCTransform
# seur.human.sct <- readRDS("~/Projects/20220809_Thymic-iNKT-CrossSpecies/data/raw_data/human_data/filtered_seurat_Harmony_07-22-22.RDS")
# hvg.all.sct <- VariableFeatures(seur.human.sct)
# length(hvg.all.sct) # 3000
# table(hvg.all %in% hvg.all.sct) # only 944 out of 2000 genes are the same...
# LabelPoints(plot=VariableFeaturePlot(seur.human.sct), points=head(VariableFeatures(seur.human.sct), 20), repel=T)

2.2. Function to obtain HVGs

# Extract the counts data (from RNA assay) and metadata
counts <- GetAssayData(object = seur.human, slot = "counts", assay="RNA")
metadata <- seur.human@meta.data
# Create new seurat object with only the counts and the "cleaned" metadata
initseurat <- CreateSeuratObject(counts = counts, meta.data = metadata, min.cells=20) # keep only genes expressed in at least 20 cells
print(initseurat) # sanity check
## An object of class Seurat 
## 17202 features across 81378 samples within 1 assay 
## Active assay: RNA (17202 features, 0 variable features)
# Quick QC check
VlnPlot(initseurat, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# initseurat <- subset(initseurat, subset = percent.mt < 25)


# ************
# Function to subset the seurat object & return HVGs
getHVG <- function(seuratobject, cells, nfeatures=2000){
  # Subset seurat object
  cat("-- Subset seurat object --\n")
  seur.subset <- subset(seuratobject, subset = group.ident %in% cells)
  cat("Nb of cells in subset object:", dim(seur.subset)[2], "\n")
  
  # Normalize & find variable features
  seur.subset <- NormalizeData(seur.subset)
  seur.subset <- FindVariableFeatures(seur.subset, nfeatures=nfeatures)
  
  # Plot top variable features
  print(LabelPoints(plot=VariableFeaturePlot(seur.subset), points=head(VariableFeatures(seur.subset), 20), repel=T))
  
  # Return HVGs
  return(VariableFeatures(seur.subset))
}

2.3. Subset data by cell lineage & obtain HVGs

# Call function to get HVGs for each cell lineage
hvg.cd4cd8 <- getHVG(seuratobject = initseurat, cells=c("CD4_Thymus", "CD8_Thymus"))
## -- Subset seurat object --
## Nb of cells in subset object: 27427

hvg.nkt    <- getHVG(seuratobject = initseurat, cells="NKT_Thymus")
## -- Subset seurat object --
## Nb of cells in subset object: 2556

hvg.mait   <- getHVG(seuratobject = initseurat, cells="MAIT_Thymus")
## -- Subset seurat object --
## Nb of cells in subset object: 4700

hvg.gdt    <- getHVG(seuratobject = initseurat, cells="GD_Thymus")
## -- Subset seurat object --
## Nb of cells in subset object: 2987

hvg.thymus <- getHVG(seuratobject = initseurat, cells=c("CD4_Thymus", "CD8_Thymus", "NKT_Thymus", "MAIT_Thymus", "GD_Thymus"))
## -- Subset seurat object --
## Nb of cells in subset object: 37670

3. COMPARE HVGs

3.1. Venn diagrams

library(VennDiagram)
library(RColorBrewer)
path.plots <- "~/Projects/20220809_Thymic-iNKT-CrossSpecies/data/06_HumanData/HVGs"


# Between all
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.mait, hvg.nkt, hvg.gdt, hvg.cd4cd8, hvg.all),
  category.names = c("thymMAIT" , "thymNKT", "thymGDT", "thymCD4CD8", "Integrated"),
  # filename = file.path(path.plots, 'compareall.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill =c(brewer.pal(3, "Pastel2"), "#FFF2AE", "#F4CAE4"),
  # Set names
  cat.default.pos = "outer",
  cat.pos = c(0, -20, 240, 150, 20),
  cat.dist = c(0.18, 0.2, 0.22,0.2,0.2),
  cat.cex = 1,
  cat.fontface = "bold",
))

## INFO [2022-12-19 11:39:05] $x
## INFO [2022-12-19 11:39:05] list(hvg.mait, hvg.nkt, hvg.gdt, hvg.cd4cd8, hvg.all)
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $category.names
## INFO [2022-12-19 11:39:05] c("thymMAIT", "thymNKT", "thymGDT", "thymCD4CD8", "Integrated")
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $filename
## INFO [2022-12-19 11:39:05] NULL
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $disable.logging
## INFO [2022-12-19 11:39:05] T
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lwd
## INFO [2022-12-19 11:39:05] [1] 2
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lty
## INFO [2022-12-19 11:39:05] [1] "blank"
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $fill
## INFO [2022-12-19 11:39:05] c(brewer.pal(3, "Pastel2"), "#FFF2AE", "#F4CAE4")
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $cat.default.pos
## INFO [2022-12-19 11:39:05] [1] "outer"
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $cat.pos
## INFO [2022-12-19 11:39:05] c(0, -20, 240, 150, 20)
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $cat.dist
## INFO [2022-12-19 11:39:05] c(0.18, 0.2, 0.22, 0.2, 0.2)
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $cat.cex
## INFO [2022-12-19 11:39:05] [1] 1
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $cat.fontface
## INFO [2022-12-19 11:39:05] [1] "bold"
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] [[13]]
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05]
# Between conventional T & integrated data
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.all, hvg.cd4cd8),
  category.names = c("Integrated" , "thymCD4CD8"),
  # filename = file.path(path.plots, 'integratedVScd4cd8.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = c("#F4CAE4", "#FFF2AE")
))

## INFO [2022-12-19 11:39:05] $x
## INFO [2022-12-19 11:39:05] list(hvg.all, hvg.cd4cd8)
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $category.names
## INFO [2022-12-19 11:39:05] c("Integrated", "thymCD4CD8")
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $filename
## INFO [2022-12-19 11:39:05] NULL
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $disable.logging
## INFO [2022-12-19 11:39:05] T
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lwd
## INFO [2022-12-19 11:39:05] [1] 2
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lty
## INFO [2022-12-19 11:39:05] [1] "blank"
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $fill
## INFO [2022-12-19 11:39:05] c("#F4CAE4", "#FFF2AE")
## INFO [2022-12-19 11:39:05]
# length(intersect(hvg.all, hvg.cd4cd8)) # 1176 sanity check


# Between MAIT & integrated data
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.all, hvg.mait),
  category.names = c("Integrated" , "thymMAIT"),
  # filename = file.path(path.plots, 'integratedVSmait.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = c("#F4CAE4", "#B3E2CD")
))

## INFO [2022-12-19 11:39:05] $x
## INFO [2022-12-19 11:39:05] list(hvg.all, hvg.mait)
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $category.names
## INFO [2022-12-19 11:39:05] c("Integrated", "thymMAIT")
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $filename
## INFO [2022-12-19 11:39:05] NULL
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $disable.logging
## INFO [2022-12-19 11:39:05] T
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lwd
## INFO [2022-12-19 11:39:05] [1] 2
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lty
## INFO [2022-12-19 11:39:05] [1] "blank"
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $fill
## INFO [2022-12-19 11:39:05] c("#F4CAE4", "#B3E2CD")
## INFO [2022-12-19 11:39:05]
# Between NKT & integrated data
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.all, hvg.nkt),
  category.names = c("Integrated" , "thymNKT"),
  # filename = file.path(path.plots, 'integratedVSnkt.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = c("#F4CAE4", "#FDCDAC")
))

## INFO [2022-12-19 11:39:05] $x
## INFO [2022-12-19 11:39:05] list(hvg.all, hvg.nkt)
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $category.names
## INFO [2022-12-19 11:39:05] c("Integrated", "thymNKT")
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $filename
## INFO [2022-12-19 11:39:05] NULL
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $disable.logging
## INFO [2022-12-19 11:39:05] T
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lwd
## INFO [2022-12-19 11:39:05] [1] 2
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $lty
## INFO [2022-12-19 11:39:05] [1] "blank"
## INFO [2022-12-19 11:39:05] 
## INFO [2022-12-19 11:39:05] $fill
## INFO [2022-12-19 11:39:05] c("#F4CAE4", "#FDCDAC")
## INFO [2022-12-19 11:39:05]
# Between GDT & integrated data
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.all, hvg.gdt),
  category.names = c("Integrated" , "thymGDT"),
  # filename = file.path(path.plots, 'integratedVSgdt.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = c("#F4CAE4", "#CBD5E8")
))

## INFO [2022-12-19 11:39:06] $x
## INFO [2022-12-19 11:39:06] list(hvg.all, hvg.gdt)
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $category.names
## INFO [2022-12-19 11:39:06] c("Integrated", "thymGDT")
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $filename
## INFO [2022-12-19 11:39:06] NULL
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $disable.logging
## INFO [2022-12-19 11:39:06] T
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lwd
## INFO [2022-12-19 11:39:06] [1] 2
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lty
## INFO [2022-12-19 11:39:06] [1] "blank"
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $fill
## INFO [2022-12-19 11:39:06] c("#F4CAE4", "#CBD5E8")
## INFO [2022-12-19 11:39:06]
# Between innate T & integrated data
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.mait, hvg.nkt, hvg.gdt, hvg.all),
  category.names = c("thymMAIT" , "thymNKT", "thymGDT", "Integrated"),
  # filename = file.path(path.plots, 'integratedVSinnate.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = brewer.pal(4, "Pastel2")
))

## INFO [2022-12-19 11:39:06] $x
## INFO [2022-12-19 11:39:06] list(hvg.mait, hvg.nkt, hvg.gdt, hvg.all)
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $category.names
## INFO [2022-12-19 11:39:06] c("thymMAIT", "thymNKT", "thymGDT", "Integrated")
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $filename
## INFO [2022-12-19 11:39:06] NULL
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $disable.logging
## INFO [2022-12-19 11:39:06] T
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lwd
## INFO [2022-12-19 11:39:06] [1] 2
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lty
## INFO [2022-12-19 11:39:06] [1] "blank"
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $fill
## INFO [2022-12-19 11:39:06] brewer.pal(4, "Pastel2")
## INFO [2022-12-19 11:39:06]
# Between all T cell lineages
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.mait, hvg.nkt, hvg.gdt, hvg.cd4cd8),
  category.names = c("thymMAIT" , "thymNKT", "thymGDT", "thymCD4CD8"),
  # filename = file.path(path.plots, 'thymicT.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = c(brewer.pal(3, "Pastel2"), "#FFF2AE")
))

## INFO [2022-12-19 11:39:06] $x
## INFO [2022-12-19 11:39:06] list(hvg.mait, hvg.nkt, hvg.gdt, hvg.cd4cd8)
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $category.names
## INFO [2022-12-19 11:39:06] c("thymMAIT", "thymNKT", "thymGDT", "thymCD4CD8")
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $filename
## INFO [2022-12-19 11:39:06] NULL
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $disable.logging
## INFO [2022-12-19 11:39:06] T
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lwd
## INFO [2022-12-19 11:39:06] [1] 2
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lty
## INFO [2022-12-19 11:39:06] [1] "blank"
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $fill
## INFO [2022-12-19 11:39:06] c(brewer.pal(3, "Pastel2"), "#FFF2AE")
## INFO [2022-12-19 11:39:06]
# Between innate T cell lineages
grid.newpage()
grid.draw(venn.diagram(
  x = list(hvg.mait, hvg.nkt, hvg.gdt),
  category.names = c("thymMAIT" , "thymNKT", "thymGDT"),
  # filename = file.path(path.plots, 'innateT.jpeg'),
  filename=NULL,
  disable.logging=T,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = brewer.pal(3, "Pastel2")
))

## INFO [2022-12-19 11:39:06] $x
## INFO [2022-12-19 11:39:06] list(hvg.mait, hvg.nkt, hvg.gdt)
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $category.names
## INFO [2022-12-19 11:39:06] c("thymMAIT", "thymNKT", "thymGDT")
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $filename
## INFO [2022-12-19 11:39:06] NULL
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $disable.logging
## INFO [2022-12-19 11:39:06] T
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lwd
## INFO [2022-12-19 11:39:06] [1] 2
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $lty
## INFO [2022-12-19 11:39:06] [1] "blank"
## INFO [2022-12-19 11:39:06] 
## INFO [2022-12-19 11:39:06] $fill
## INFO [2022-12-19 11:39:06] brewer.pal(3, "Pastel2")
## INFO [2022-12-19 11:39:06]

3.2. Plot HVGs on expression x variance plot

# MAIT vs integrated
setdiff(hvg.mait, hvg.all)[1:10]
##  [1] "BLK"        "FGF11"      "AC005726.5" "WIPF3"      "SMARCA1"   
##  [6] "ACOT9"      "ZNF257"     "FAM174A"    "FAM13A.AS1" "TRAK2"
setdiff(hvg.all, hvg.mait)[1:10]
##  [1] "IGKC"   "IGHM"   "JCHAIN" "LYZ"    "KRT1"   "TRDC"   "STMN1"  "HBEGF" 
##  [9] "IRF8"   "FCN1"
LabelPoints(plot=VariableFeaturePlot(seur.human, log=T), points=head(setdiff(hvg.mait, hvg.all), 20), repel=T, ynudge = 5, xnudge=10)

table("NR4A1" %in% setdiff(hvg.mait, hvg.all))
## 
## FALSE 
##     1
# MAIT-specific HVG
FeaturePlot(seur.human, reduction="UMAP_50", features="BLK", order=T) + ggtitle("BLK | integrated") |
  FeaturePlot(subset(seur.human, subset= group.ident=="MAIT_Thymus"), reduction="UMAP_50", features="BLK", order=T) + ggtitle("BLK | thymMAIT")

# ggsave("~/Projects/20220809_Thymic-iNKT-CrossSpecies/data/06_HumanData/HVGs/mait-specific-HVG_BLKexample.jpeg", width=10, height=5)

# Integrated data-specific HVG
FeaturePlot(seur.human, reduction="UMAP_50", features="JCHAIN", order=T) + ggtitle("JCHAIN | integrated") |
  FeaturePlot(subset(seur.human, subset= group.ident=="MAIT_Thymus"), reduction="UMAP_50", features="JCHAIN", order=T) + ggtitle("JCHAIN | thymMAIT")

# ggsave("~/Projects/20220809_Thymic-iNKT-CrossSpecies/data/06_HumanData/HVGs/integrated-specific-HVG_JCHAINexample.jpeg", width=10, height=5)